{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## POLICY GRADIENT on CartPole"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Policy Gradient algorithms find an optimal behavior strategy by directly optimizing the policy.\n",
    "The policy is a function parametrized by $\\theta$, written $\\pi_\\theta(a|s)$.\n",
    "\n",
    "The objective function (the expected reward under the policy) is defined as\n",
    "$$J(\\theta) = \\sum_{s}d^\\pi(s)\\sum_{a}\\pi_\\theta(a|s)Q^\\pi(s,a)$$\n",
    "\n",
    "In Vanilla Policy Gradient (the REINFORCE algorithm), we estimate the return $R_t$ and, when updating the policy, subtract a baseline value from $R_t$ to reduce the variance of the gradient estimate."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"imgs/Vanilla_policy_gradient.png\" alt=\"drawing\" width=\"500\"/>\n",
    "Credit: John Schulman"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import gym\n",
    "from tensorboardX import SummaryWriter\n",
    "\n",
    "import time\n",
    "from collections import namedtuple\n",
    "from collections import deque\n",
    "import datetime\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PG_nn(nn.Module):\n",
    "    '''\n",
    "    Policy neural net\n",
    "    '''\n",
    "    def __init__(self, input_shape, n_actions):\n",
    "        super(PG_nn, self).__init__()\n",
    "\n",
    "        self.mlp = nn.Sequential(\n",
    "            nn.Linear(input_shape[0], 64),\n",
    "            nn.ReLU(),\n",
    "            nn.Linear(64, n_actions))\n",
    "\n",
    "    def forward(self, x):\n",
    "        return self.mlp(x.float())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def discounted_rewards(memories, gamma):\n",
    "    '''\n",
    "    Compute the discounted reward backward\n",
    "    '''\n",
    "\n",
    "    disc_rew = np.zeros(len(memories))\n",
    "    run_add = 0\n",
    "\n",
    "    for t in reversed(range(len(memories))):\n",
    "        if memories[t].done: run_add = 0\n",
    "        run_add = run_add * gamma + memories[t].reward\n",
    "        disc_rew[t] = run_add\n",
    "\n",
    "    return disc_rew"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Memory = namedtuple('Memory', ['obs', 'action', 'new_obs', 'reward', 'done'], verbose=False, rename=False)\n",
    "\n",
    "GAMMA = 0.99\n",
    "LEARNING_RATE = 0.002\n",
    "ENTROPY_BETA = 0.01\n",
    "ENV_NAME = 'CartPole-v0'\n",
    "\n",
    "MAX_N_GAMES = 10000\n",
    "n_games = 0\n",
    "\n",
    "device = 'cpu'\n",
    "\n",
    "now = datetime.datetime.now()\n",
    "date_time = \"{}_{}.{}.{}\".format(now.day, now.hour, now.minute, now.second)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the policy with REINFORCE plus a running-mean baseline: play one game,\n",
    "# compute the discounted returns, then take a single gradient step per game.\n",
    "env = gym.make(ENV_NAME)\n",
    "obs = env.reset()\n",
    "\n",
    "# Initialize the writer\n",
    "writer = SummaryWriter(log_dir='content/runs/A2C'+ENV_NAME+'_'+date_time)\n",
    "\n",
    "# create the agent neural net\n",
    "action_n = env.action_space.n\n",
    "agent_nn = PG_nn(env.observation_space.shape, action_n).to(device)\n",
    "\n",
    "# Adam optimizer\n",
    "optimizer = optim.Adam(agent_nn.parameters(), lr=LEARNING_RATE)\n",
    "\n",
    "experience = []\n",
    "tot_reward = 0\n",
    "n_iter = 0\n",
    "# restart the game counter here so that re-running this cell trains from scratch\n",
    "# instead of skipping the loop because of a value left over in the kernel\n",
    "n_games = 0\n",
    "# deque list to keep the baseline\n",
    "baseline = deque(maxlen=30000)\n",
    "game_rew = 0\n",
    "\n",
    "## MAIN BODY\n",
    "try:\n",
    "    while n_games < MAX_N_GAMES:\n",
    "\n",
    "        n_iter += 1\n",
    "\n",
    "        # execute the agent; no gradient is needed while only acting\n",
    "        with torch.no_grad():\n",
    "            act = agent_nn(torch.as_tensor(obs).to(device))\n",
    "            # explicit dim: the implicit-dim form of softmax is deprecated\n",
    "            act_soft = F.softmax(act, dim=-1)\n",
    "        # get an action following the policy distribution\n",
    "        action = int(np.random.choice(action_n, p=act_soft.cpu().numpy()))\n",
    "\n",
    "        # make a step in the env\n",
    "        new_obs, reward, done, _ = env.step(action)\n",
    "\n",
    "        game_rew += reward\n",
    "        # update the experience list with the last memory\n",
    "        experience.append(Memory(obs=obs, action=action, new_obs=new_obs, reward=reward, done=done))\n",
    "\n",
    "        obs = new_obs\n",
    "\n",
    "        if done:\n",
    "            # Calculate the discounted rewards\n",
    "            disc_rewards = discounted_rewards(experience, GAMMA)\n",
    "\n",
    "            # update the baseline\n",
    "            baseline.extend(disc_rewards)\n",
    "            # subtract the baseline mean from the discounted reward.\n",
    "            disc_rewards -= np.mean(baseline)\n",
    "\n",
    "            # run the agent NN on the obs in the experience list\n",
    "            # (stack into one ndarray first: much faster than a list of arrays)\n",
    "            obs_batch = torch.as_tensor(np.array([e.obs for e in experience])).to(device)\n",
    "            acts = agent_nn(obs_batch)\n",
    "            log_probs = F.log_softmax(acts, dim=1)\n",
    "\n",
    "            # take the log softmax of the action taken at each step.\n",
    "            # gather picks one entry per row, giving a vector with one value per\n",
    "            # step; indexing with [:, actions] would instead select whole columns\n",
    "            # and produce a (steps x steps) matrix that broadcasts wrongly against\n",
    "            # the returns.\n",
    "            actions_t = torch.as_tensor([e.action for e in experience], dtype=torch.int64, device=device)\n",
    "            game_act_log_softmax_t = log_probs.gather(1, actions_t.unsqueeze(1)).squeeze(1)\n",
    "\n",
    "            disc_rewards_t = torch.tensor(disc_rewards, dtype=torch.float32).to(device)\n",
    "\n",
    "            # compute the loss entropy (this is minus the entropy, so minimizing\n",
    "            # it pushes the policy towards higher entropy)\n",
    "            l_entropy = ENTROPY_BETA * torch.mean(torch.sum(F.softmax(acts, dim=1) * log_probs, dim=1))\n",
    "\n",
    "            # compute the loss\n",
    "            loss = - torch.mean(disc_rewards_t * game_act_log_softmax_t)\n",
    "            loss = loss + l_entropy\n",
    "\n",
    "            # optimize\n",
    "            optimizer.zero_grad()\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "            # print the stats\n",
    "            loss_value = loss.item()\n",
    "            writer.add_scalar('loss', loss_value, n_iter)\n",
    "            writer.add_scalar('reward', game_rew, n_iter)\n",
    "\n",
    "            print(n_games, loss_value, game_rew, np.mean(disc_rewards), np.mean(baseline))\n",
    "\n",
    "            # reset the variables and env\n",
    "            experience = []\n",
    "            game_rew = 0\n",
    "            obs = env.reset()\n",
    "            n_games += 1\n",
    "finally:\n",
    "    # always flush the logs and release the environment, even when training\n",
    "    # is interrupted or fails\n",
    "    writer.close()\n",
    "    env.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Reward](imgs/reward_pg.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
